This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(caret)
library(data.table)
library(plotly)
# load the built-in iris data set (data() loads it; nothing is attached)
data(iris)
dt <- iris
# seed the RNG so the random split below is reproducible from run to run
set.seed(7)
# sample row indices covering 80% of the data, balanced across Species
validation_index <- createDataPartition(dt$Species, p = 0.8, list = FALSE)
# hold out the remaining 20% for validation and keep the sampled 80% for training
# NOTE: despite its name, dtTest is the 80% partition that the models are
# trained on; dtValidation is the 20% hold-out used for the final check
dtValidation <- data.table(dt[-validation_index, ])
dtTest <- data.table(dt[validation_index, ])
# dimensions of the training partition
dim(dtTest)
[1] 120 5
# report the class of every column in the full data set
sapply(dt, function(column) class(column))
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
"numeric" "numeric" "numeric" "numeric" "factor"
# show the first six rows of the full data set
head(dt, n = 6L)
# enumerate the class labels of the response variable
levels(dt[["Species"]])
[1] "setosa" "versicolor" "virginica"
# per-class share (in percent) of the rows in dtTest
percentage <- 100 * prop.table(table(dtTest[["Species"]]))
# absolute counts next to the percentages, one row per Species level
cbind(
  freq = table(dtTest[["Species"]]),
  percentage = percentage
)
freq percentage
setosa 40 33.33333
versicolor 40 33.33333
virginica 40 33.33333
# per-column summary statistics (and per-level counts for Species) of dtTest
summary(object = dtTest)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. :4.300 Min. :2.200 Min. :1.000 Min. :0.100 setosa :40
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:40
Median :5.750 Median :3.000 Median :4.400 Median :1.350 virginica :40
Mean :5.826 Mean :3.057 Mean :3.758 Mean :1.211
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.700 Max. :2.500
# univariate plots: boxplot of each individual variable
# reshape dtTest to long format: one row per Species / Type / Value triple
# (dtTest is already a data.table, so melt() dispatches to the data.table method)
dtPlot <- melt(dtTest,
  id.vars = "Species",
  measure.vars = c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"),
  variable.name = "Type", value.name = "Value")
# min-max scale Value to [0, 1]; the min and max are taken over all four
# measurements together, not per Type
dtPlot$Normalized <- (dtPlot$Value - min(dtPlot$Value))/(max(dtPlot$Value) - min(dtPlot$Value))
plot_ly(dtPlot, y = ~Normalized, color = ~Type, type = "box", boxpoints = "all", jitter = 0.3)
# bar plot of class breakdown (row count per Species)
plot_ly(dtTest[, .N, keyby = Species], y = ~N, x = ~Species, type = "bar")
# TODO: multivariate plots (feature plot of attributes coloured by class, box
# plot for each attribute by species) have no code in this section
# resampling scheme handed to every train() call: 10-fold cross-validation
control <- trainControl(method = "cv", number = 10)
Warning messages:
1: package ‘ggplot2’ was built under R version 3.6.3
2: package ‘caret’ was built under R version 3.6.3
3: package ‘data.table’ was built under R version 3.6.3
4: package ‘plotly’ was built under R version 3.6.3
# performance metric passed to every train() call
metric <- "Accuracy"
# The seed is reset to the same value before every fit so that each model
# starts from an identical RNG state when its cross-validation runs.
# Linear Discriminant Analysis (LDA)
set.seed(7)
fit.lda <- train(Species ~ ., data = dtTest, method = "lda",
  metric = metric, trControl = control)
# Classification and Regression Trees (CART)
set.seed(7)
fit.cart <- train(Species ~ ., data = dtTest, method = "rpart",
  metric = metric, trControl = control)
# k-Nearest Neighbors (kNN)
set.seed(7)
fit.knn <- train(Species ~ ., data = dtTest, method = "knn",
  metric = metric, trControl = control)
# Support Vector Machines (SVM) with a radial basis function (RBF) kernel
# ("svmRadial" is the radial kernel, not a linear one; needs the kernlab package)
set.seed(7)
fit.svm <- train(Species ~ ., data = dtTest, method = "svmRadial",
  metric = metric, trControl = control)
1 package is needed for this model and is not installed. (kernlab). Would you like to try to install it now?
1: yes
2: no
yes
Installing package into ‘/home/jabagat/R/x86_64-pc-linux-gnu-library/3.6’
(as ‘lib’ is unspecified)
trying URL 'https://rstudiopm.santeecooper.com/prod-cran/__linux__/centos7/latest/src/contrib/kernlab_0.9-29.tar.gz'
Content type 'application/x-gzip' length 2457679 bytes (2.3 MB)
==================================================
downloaded 2.3 MB
* installing *binary* package ‘kernlab’ ...
* DONE (kernlab)
The downloaded source packages are in
‘/tmp/Rtmp2VujRU/downloaded_packages’
# Random Forest (RF); same seed as the other fits, needs the randomForest package
set.seed(7)
fit.rf <- train(
  Species ~ .,
  data = dtTest,
  method = "rf",
  metric = metric,
  trControl = control
)
1 package is needed for this model and is not installed. (randomForest). Would you like to try to install it now?
1: yes
2: no
yes
Installing package into ‘/home/jabagat/R/x86_64-pc-linux-gnu-library/3.6’
(as ‘lib’ is unspecified)
trying URL 'https://rstudiopm.santeecooper.com/prod-cran/__linux__/centos7/latest/src/contrib/randomForest_4.6-14.tar.gz'
Content type 'application/x-gzip' length 263356 bytes (257 KB)
==================================================
downloaded 257 KB
* installing *binary* package ‘randomForest’ ...
* DONE (randomForest)
The downloaded source packages are in
‘/tmp/Rtmp2VujRU/downloaded_packages’
# summarize accuracy of the models: gather the resampling results of all five
# fits and print their Accuracy and Kappa distributions
results <- resamples(
  list(
    lda = fit.lda,
    cart = fit.cart,
    knn = fit.knn,
    svm = fit.svm,
    rf = fit.rf
  )
)
summary(results)
Call:
summary.resamples(object = results)
Models: lda, cart, knn, svm, rf
Number of resamples: 10
Accuracy
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
lda 0.9166667 1.0000000 1.0000000 0.9916667 1 1 0
cart 0.8333333 0.9375000 1.0000000 0.9583333 1 1 0
knn 0.9166667 0.9375000 1.0000000 0.9750000 1 1 0
svm 0.9166667 0.9166667 0.9583333 0.9583333 1 1 0
rf 0.8333333 0.9375000 1.0000000 0.9666667 1 1 0
Kappa
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
lda 0.875 1.00000 1.0000 0.9875 1 1 0
cart 0.750 0.90625 1.0000 0.9375 1 1 0
knn 0.875 0.90625 1.0000 0.9625 1 1 0
svm 0.875 0.87500 0.9375 0.9375 1 1 0
rf 0.750 0.90625 1.0000 0.9500 1 1 0
# dot plot comparing the resampled performance of the models
dotplot(results)
# print the resampling summary of the LDA fit (highest mean accuracy above)
print(x = fit.lda)
Linear Discriminant Analysis
120 samples
4 predictor
3 classes: 'setosa', 'versicolor', 'virginica'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 108, 108, 108, 108, 108, 108, ...
Resampling results:
Accuracy Kappa
0.9916667 0.9875
# estimate skill of LDA on the hold-out rows in dtValidation
predictions <- predict(fit.lda, newdata = dtValidation)
# cross-tabulate predicted against actual Species and report the statistics
confusionMatrix(data = predictions, reference = dtValidation$Species)
Confusion Matrix and Statistics
Reference
Prediction setosa versicolor virginica
setosa 10 0 0
versicolor 0 9 1
virginica 0 1 9
Overall Statistics
Accuracy : 0.9333
95% CI : (0.7793, 0.9918)
No Information Rate : 0.3333
P-Value [Acc > NIR] : 8.747e-12
Kappa : 0.9
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: setosa Class: versicolor Class: virginica
Sensitivity 1.0000 0.9000 0.9000
Specificity 1.0000 0.9500 0.9500
Pos Pred Value 1.0000 0.9000 0.9000
Neg Pred Value 1.0000 0.9500 0.9500
Prevalence 0.3333 0.3333 0.3333
Detection Rate 0.3333 0.3000 0.3000
Detection Prevalence 0.3333 0.3333 0.3333
Balanced Accuracy 1.0000 0.9250 0.9250